Leftovers from Bankruptcy Detection

In [ ]:
# Build a binomial/logit GLM pipeline (RFormula -> StandardScaler -> GLR) and a
# train/validation split that tunes it.

# Columns of logDf that must not be used as model features (identifiers,
# free text and the target itself).
excludedCols = ["cvrNummer","label","status","navn","kortBeskrivelse"]
featureCols = [c for c in logDf.columns if c not in excludedCols]

# Nulls are replaced with 0.0 before modelling.
imputedDf = logDf.fillna(value=0.0)

# "label ~ f1 + f2 + ..." over every non-excluded column.
formula = RFormula(formula="label ~ " + " + ".join(featureCols), labelCol="label")

# The scaler reads the vector produced by RFormula, and the GLR is wired to the
# scaler's output. Previously the GLR read the unscaled "features" column, so
# the scaling stage had no effect on the fitted model.
standardScale = StandardScaler(withMean=True, withStd=True,
                               inputCol=formula.getFeaturesCol(), outputCol="scaledFeatures")
glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.3,
                                  labelCol="label", featuresCol=standardScale.getOutputCol())

pipeline = Pipeline(stages=[formula, standardScale, glr])

# The grid must reference params of the estimator that is actually in the
# pipeline (glr). GeneralizedLinearRegression has no rawPredictionCol,
# probabilityCol or elasticNetParam, so the regularisation strength is searched
# instead. NOTE(review): the regParam candidates are a starting point that
# includes the constructor value 0.3 - adjust as needed.
grid = (ParamGridBuilder()
        .baseOn({glr.predictionCol: "prediction"})
        .baseOn({glr.labelCol: "label"})
        .baseOn({glr.featuresCol: standardScale.getOutputCol()})
        .addGrid(param=glr.regParam, values=[0.1, 0.3])
        .addGrid(param=glr.maxIter, values=[10])
        .build()
       )

# GLR emits only a double-valued prediction column (no rawPrediction vector),
# so the evaluator is pointed at that column explicitly.
evaluate = BinaryClassificationEvaluator(rawPredictionCol=glr.getPredictionCol(), labelCol="label")

trainEvalModel = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                      evaluator=evaluate, trainRatio=0.8)

In [ ]:
# Feature columns first, target column last.
cols = [name for name in logDf.columns if name not in excludedCols]
cols.append("label")

# Fit on rows whose label is at most 1, restricted to the modelling columns.
trainingRows = imputedDf.filter(F.col("label") <= 1).select(*cols)
model = pipeline.fit(trainingRows)

In [ ]:
# Last stage of the fitted pipeline (the pipeline is built with the GLR last).
coef = model.stages[-1]

# Score the modelling columns of the rows whose label is at most 1.
scoringRows = imputedDf.select(*cols).filter(F.col("label") <= 1)
predict = model.transform(scoringRows)

In [ ]:
# Training summary of the last fitted pipeline stage.
p = model.stages[-1].summary

# (printed prefix, summary attribute) pairs, reported in this order. Attributes
# are looked up one at a time so each line prints before the next is computed.
summaryFields = [
    ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
    ("T Values: ", "tValues"),
    ("P Values: ", "pValues"),
    ("Dispersion: ", "dispersion"),
    ("Null Deviance: ", "nullDeviance"),
    ("Residual Degree Of Freedom Null: ", "residualDegreeOfFreedomNull"),
    ("Deviance: ", "deviance"),
    ("Residual Degree Of Freedom: ", "residualDegreeOfFreedom"),
    ("AIC: ", "aic"),
]
for prefix, attribute in summaryFields:
    print(prefix + str(getattr(p, attribute)))

print("Deviance Residuals: ")
p.residuals().show()

In [ ]:
# Quick sanity output: number of modelling columns and the coefficient array type.
print(len(cols))
print(type(coef.coefficients.toArray()))
print()

# One row per feature plus a final "intercept" row.
# The intercept is APPENDED (not inserted at index 0): the label list puts
# "intercept" last, and Spark's GLR summary reports the intercept as the last
# element of the standard errors / t-values / p-values when fitIntercept is
# true. Inserting it first shifted every coefficient one row away from its
# label and its statistics.
# NOTE(review): assumes each column in cols[:-1] maps to exactly one feature
# (i.e. no string columns expanded by RFormula) - confirm.
summary = {
    "Labels": cols[:-1] + ["intercept"],
    "Coefficients": np.append(coef.coefficients.toArray(), coef.intercept),
    "coefficient Std Err": p.coefficientStandardErrors,
    "T Values": p.tValues,
    "P Values": p.pValues,
}

In [ ]:
# Render the coefficient summary as a standalone HTML page (test.html).
import subprocess  # NOTE(review): not used in this cell - kept in case other cells rely on it.

# Show floats with thousands separators and four decimals.
pd.options.display.float_format = '{:,.4f}'.format
df = pd.DataFrame(summary,columns=["Labels","Coefficients","coefficient Std Err","T Values","P Values"])

# The page declares UTF-8 and the file is written as UTF-8 below, so column
# labels containing non-ASCII characters are written and rendered correctly
# regardless of the platform's default encoding.
# NOTE(review): the ".df tbody" selector has no declaration block, so it
# currently styles nothing - fill in or remove.
HEADER = '''
<html>
    <head>
        <meta charset="utf-8">
        <style>
            .df tbody 
        </style>
    </head>
    <body>
'''
FOOTER = '''
    </body>
</html>
'''

with open('test.html', 'w', encoding='utf-8') as f:
    f.write(HEADER)
    f.write(df.to_html(classes='df'))
    f.write(FOOTER)

In [ ]: